# -*- coding: utf-8 -*-


import scrapy
from test1.items import hc360cominfo
from scrapy import Request
from scrapy.loader import ItemLoader
from scrapyluke.processors import *
import datetime

class ChinahrSpider(scrapy.Spider):
    """Crawl the hc360.com directory and emit one ``hc360cominfo`` per page.

    The crawl is a four step chain of callbacks:

    1. ``parse`` collects the region links from the start page.
    2. ``parse_jump`` reads the page count of a region and requests every
       numbered listing page below the region URL.
    3. ``parse_goto`` follows every link inside the ``right-con`` list of a
       listing page.
    4. ``parse_info`` scrapes the detail page into an ``hc360cominfo`` item.
    """

    name = 'hc360'
    start_urls = ['http://cn.hc360.com/sh/']

    # Region links are read from the <p> children 2..39 of the area box.
    _REGION_POSITIONS = range(2, 40)
    _REGION_LINK_XPATH = (
        "/html/body/div[@class='enterprise']/div[@class='class-con']"
        "/div[@class='class-left']/div[@class='top-box']"
        "/div[@class='areatop-con']/p[%d]/a/@href"
    )

    # The fifth anchor of the pager is expected to hold the total page count.
    _PAGE_TOTAL_XPATH = (
        "/html/body/div[@class='enterprise']/div[@class='class-con']"
        "/div[@class='class-right']/div[@class='enterprise-page']"
        "/a[5]/text()"
    )

    _DETAIL_LINK_XPATH = '//ul[@class = "right-con"]//a/@href'

    # Common prefix of every XPath used on the detail page.
    _INTRO_XPATH = (
        "/html/body/div[@class='enterprise']/div[@class='intro']"
        "/div[@class='intro-left']"
    )

    # (item field, XPath suffix below _INTRO_XPATH) for the header area.
    _HEADER_FIELDS = (
        ('com_name',
         "/div[@class='intro-info']/ul/li[@class='intro-infotop']/h1/text()"),
        ('com_introduction', "/div[@class='intro-tabcon']/p/text()"),
        ('com_address', "/div[@class='intro-info']/ul/li[3]/text()"),
        ('com_contact', "/div[@class='intro-info']/ul/li[4]/text()"),
    )

    # Item fields of the detail table, one tuple per <tr> (starting at
    # tr[1]); within a tuple the entries map to td[1] and td[2].
    _TABLE_FIELDS = (
        ('com_product', 'com_industry'),
        ('com_type', 'com_model'),
        ('com_regaddress', 'com_mngaddress'),
        ('com_buildtime', 'com_representative'),
        ('com_staffnum', 'com_annualturnover'),
        ('com_brand', 'com_regcapital'),
        ('com_customer', 'com_market'),
        ('com_export', 'com_import'),
        ('com_bank', 'com_bankaccount'),
        ('com_isoem', 'com_developmentnum'),
        ('com_monthlyproduction', 'com_roomarea'),
        ('com_qualitycontrol', 'com_mngcertification'),
        ('com_credentials', 'com_certificates'),
        ('com_reference', 'memberevaluation'),
    )

    @staticmethod
    def _absolute_links(response, hrefs):
        """Yield absolute URLs for the non-blank entries of ``hrefs``.

        ``scrapy.Request`` rejects URLs without a scheme, so relative and
        protocol-relative hrefs are resolved against ``response.url`` and
        blank hrefs are skipped instead of aborting the whole callback.
        """
        for href in hrefs:
            href = href.strip()
            if href:
                yield response.urljoin(href)

    def parse(self, response):
        """Request the secondary page of every region linked on the start page.

        Yields one ``scrapy.Request`` per region link, handled by
        ``parse_jump``.
        """
        for position in self._REGION_POSITIONS:
            hrefs = response.xpath(
                self._REGION_LINK_XPATH % position).extract()
            for url in self._absolute_links(response, hrefs):
                yield scrapy.Request(url, callback=self.parse_jump)

    def parse_jump(self, response):
        """Read the total page count and request every listing page.

        Listing pages are addressed as ``<region url>/<page number>/`` for
        page numbers ``1..total``. Nothing is yielded when the page count is
        missing or is not a number.
        """
        raw_total = response.xpath(self._PAGE_TOTAL_XPATH).extract_first()
        if not raw_total:
            return
        try:
            total = int(raw_total)
        except ValueError:
            # The fifth pager anchor is not always a number; skip the region
            # with a visible warning rather than raising inside the callback.
            self.logger.warning(
                "Unexpected page total %r on %s", raw_total, response.url)
            return

        # Make sure the page number becomes its own path segment even when
        # the region URL has no trailing slash.
        base = response.url
        if not base.endswith('/'):
            base += '/'
        for page in range(1, total + 1):
            yield scrapy.Request(
                base + str(page) + '/', callback=self.parse_goto)

    def parse_goto(self, response):
        """Follow every link of the ``right-con`` list to its detail page.

        Yields one ``scrapy.Request`` per link, handled by ``parse_info``.
        """
        hrefs = response.xpath(self._DETAIL_LINK_XPATH).extract()
        for url in self._absolute_links(response, hrefs):
            yield scrapy.Request(url, callback=self.parse_info)

    def parse_info(self, response):
        """Scrape one detail page into an ``hc360cominfo`` item.

        Every scraped field holds the first matching text node, or ``None``
        when the page has no such node. ``url`` is the response URL and
        ``insert_time`` is the local scrape time as a string.
        """
        item = hc360cominfo()
        item['url'] = response.url
        item['insert_time'] = str(datetime.datetime.now())

        root = self._INTRO_XPATH
        for field, suffix in self._HEADER_FIELDS:
            item[field] = response.xpath(root + suffix).extract_first()

        # Walk the detail table row by row, left cell before right cell.
        for row, fields in enumerate(self._TABLE_FIELDS, 1):
            for col, field in enumerate(fields, 1):
                cell_xpath = "%s/table/tr[%d]/td[%d]/text()" % (root, row, col)
                item[field] = response.xpath(cell_xpath).extract_first()
        return item























